Raw data preprocessing

Raw data downloaded from GEO series GSE47460 is preprocessed in bash using a set of commands found on this git.

# This first part is in shell and requires to unzip the raw data TAR and then gunzip all the files contained in the tar

for i in *txt; do mv "$i"  `echo $i | sed -E s/'GSM[0-9]+_'/''/g`; done

for f in data/LT*.txt; do 
    rm -f a.tmp
     awk 'BEGIN { seen=0;FS=OFS="\t" }
         { if($1=="FEATURES"){ seen=1 }
         if(seen==1){ print $0; }}' $f > a.tmp
     nlines=$(wc -l "a.tmp" | awk '{ print $1 }')
     n=$(($nlines - 1))
     mv a.tmp data/scrubbed/`basename $f .txt`.${n}.txt
   echo $f $n
 done


# for each chip used, create a target file.
 
 printf "FileName\tCondition\n" > data/scrubbed/Targets_45015.txt
 paste <(ls data/scrubbed/*.45015*) <(ls data/scrubbed/*.45015* | sed -E s#'^data/scrubbed/LT[A-Z0-9]+_'#''#g | sed s/.45015.txt//g) >> data/scrubbed/Targets_45015.txt
 printf "FileName\tCondition\n" > data/scrubbed/Targets_62976.txt
 paste <(ls data/scrubbed/*.62976*) <(ls data/scrubbed/*.62976* | sed -E s#'^data/scrubbed/LT[A-Z0-9]+_'#''#g | sed s/.62976.txt//g) >> data/scrubbed/Targets_62976.txt

# make directories to save plots and tables

 mkdir Lung_Function_Correlations
    mkdir Lung_Function_Correlations/predicted_dlco
    mkdir Lung_Function_Correlations/predicted_fev1_postbd
    mkdir Lung_Function_Correlations/predicted_fev1_prebd
    mkdir Lung_Function_Correlations/predicted_fvc_postbd
    mkdir Lung_Function_Correlations/predicted_fvc_prebd

 mkdir Fibrosis_Genes_Correlations
    mkdir Fibrosis_Genes_Correlations/FN1
    mkdir Fibrosis_Genes_Correlations/ACTA2
    mkdir Fibrosis_Genes_Correlations/COL3A1
    mkdir Fibrosis_Genes_Correlations/COL1A1
    mkdir Fibrosis_Genes_Correlations/COL1A2
    mkdir Fibrosis_Genes_Correlations/TIMP1
    mkdir Fibrosis_Genes_Correlations/MMP2
    mkdir Fibrosis_Genes_Correlations/MMP9

Libraries and functions

libs = c("limma", "RVAideMemoire", "pspearman", "tidyverse", "openxlsx", "biomaRt", "beeswarm", "nortest", "gplots", "pheatmap", "viridis", "robust")
suppressMessages({lapply(libs, require, character.only = T)})
source("helper_functions.R")

Targets and metadata

We read the targets.txt file that was created earlier in bash. We then create a metadata table in the targets dataframe in which more information is stored.

## Set chunk as not yet done
dotwo = TRUE

## Load human ensembl biomart 
mart <- useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host = "www.ensembl.org", ensemblRedirect = FALSE)

## Load conversion table
bm <- getBM(attributes=c("ensembl_gene_id", "refseq_mrna", "hgnc_symbol" ), mart=mart) %>%
  distinct() %>% 
  as_tibble() %>%
  na_if("") %>% 
  na.omit()
bm = as.data.frame(bm)

## Load microarray targets 
GROUP="62976"
targets = readTargets(paste("data/scrubbed/Targets_", GROUP, ".txt", sep=""))
targets$Title = as.character(sapply(targets$FileName, function(x) gsub(x, pattern = "data/scrubbed/", replacement = "", fixed = T)))
targets$Title = as.character(sapply(targets$Title, function(x) gsub(x, pattern = ".62976.txt", replacement = "", fixed = T)))

## Read demographics table taken from NCBI
messy <- read.xlsx("pdata.xlsx", fillMergedCells = TRUE)

## Create a column for each characteristic
tidyup <- messy %>% 
            separate(Characteristics, into = c("variable", "value"), sep = ": ") %>% 
            spread(variable, value) 

colnames(tidyup)[9:15] = c("emphysema_f950", "predicted_dlco", "predicted_fev1_postbd", "predicted_fev1_prebd", "predicted_fvc_postbd", "predicted_fvc_prebd", "smoker")
tidyup[,16] <- NULL
targets = as.data.frame(cbind(targets, tidyup))
targets[is.na(targets$Ild.subtype),'Ild.subtype'] = 'CTRL'

## Set chunk as done
dotwo = FALSE

Subsetting by condition: CTRL and IPF only

We only take microarrays from the ILD (Interstitial Lung Disease) condition, with subtype “UIP/IPF”. We read the Agilent array images, correct the background, normalize using quantile normalization, then log-transform the data. Replicates are averaged.

## Subset for CTRL and Interstitial Lung Disease patients
targets_2 = targets[ targets$Condition %in% c("CTRL", "ILD"),]

## Subset for CTRL and IPF
targets_2 = targets_2[ targets_2$Ild.subtype %in% c("CTRL", "2-UIP/IPF"),]
## Set chunk as not yet done
dothree = TRUE

## Read all agilent microarray files
dat3 = read.maimages(files=targets_2$FileName, path=".", source="agilent.median", green.only=T,
    columns=list(G="gMedianSignal", Gb="gBGMedianSignal"), 
    annotation=c("Row", "Col", "ProbeName", "SystematicName")
)

## Background correction
dat3 <- backgroundCorrect(dat3, method="normexp", offset=1)

## Normalization
dat3$E <- normalizeBetweenArrays(dat3$E, method="quantile")

## Log-transformation
dat3$E <- log2(dat3$E)
 #very important: E and M are identical for a 1 color array
E3 = new("MAList", list(targets=dat3$targets, genes=dat3$genes, source=dat3$source, M=dat3$E, A=dat3$E))

## Average replicates
E3.avg <- avereps(E3, ID=E3$genes$SystematicName)

# make the design matrix
colnames(E3.avg$A) = paste(colnames(E3.avg$A), "txt", sep = '.')

## Set chunk as done
dothree = FALSE

limma differential expression analysis

A linear model is fit, correcting for age and sex. Then the empirical Bayes shrinkage is applied. We then visualize the results for IL11.

## Limma analysis: set factors, levels and model matrix 
d3.levels = unique(targets_2$Condition)
d3.factor = factor(targets_2$Condition, levels=d3.levels)
d3.factor = relevel(d3.factor, "CTRL")
d3.age = as.numeric(targets_2$Age)

d3.sex = factor(targets_2$Sex, levels = unique(targets_2$Sex))
d3.sex = gsub(d3.sex, pattern = "1-Male", replacement = "Male")
d3.sex = gsub(d3.sex, pattern = "2-Female", replacement = "Female")
d3.sex = factor(d3.sex)

d3.design = model.matrix(~ 0 + d3.factor + d3.age + d3.sex)

## Fit the linear model
fit.3 = lmFit(E3.avg$A, d3.design)

## Contrast between IPF and CTRL
contrast.matrix3 <- makeContrasts("d3.factorILD-d3.factorCTRL", levels=d3.design)
fit.3 <- contrasts.fit(fit.3, contrast.matrix3)

## Apply empirical Bayes shrinkage
fit.3.2 <- eBayes(fit.3)

## Adjust p-values and sort
tt_ILD_3 = topTable(fit.3.2, adjust = 'fdr', coef = 1, genelist = E3.avg$genes, number = Inf)

## Results for IL11
tt_ILD_3[tt_ILD_3$SystematicName == 'NM_000641',]
## Plot log-normalized average IL11 expression in CTRL and IPF
beeswarm(E3.avg$A['NM_000641',targets_2$FileName] ~ targets_2$Condition, method = 'hex', pch = 21, col = c("purple", "salmon"), cex = 0.6, bg = 'white', ylim = c(0,12), ylab = 'log(IL11 average expression)', xlab = 'Condition', main = 'IL11 expression - lung microarray')
boxplot(E3.avg$A['NM_000641',targets_2$FileName] ~ targets_2$Condition, ylim = c(0,11), col = NA, border = 'black', lwd = 1, pch = 21, bg = 'white', xaxt = 'n', yaxt = 'n', outline = F, boxwex = 0.5, add = T)
segments(x0 = 1, x1 = 2, y0 = max(E3.avg$A['NM_000641',targets_2$FileName])+1, y1 = max(E3.avg$A['NM_000641',targets_2$FileName])+1)
    text(x = 1.5, y = max(E3.avg$A['NM_000641',targets_2$FileName])+1.5, labels = paste("FDR = ",formatC(tt_ILD_3[tt_ILD_3$SystematicName == 'NM_000641',9], format = 'e', digits = 3)))

IL11 is significantly up-regulated in IPF lung tissue compared to healthy controls, controlling for age and sex.

Smoker status

Smoker status information is present only in a subset of samples. We subset the targets and run the same analysis:

## Subset to have complete cases of smoker status
targets_smoker <- targets_2[!is.na(targets_2$smoker),]

## Limma analysis: set factors, levels and model matrix
d4.levels = unique(targets_smoker$Condition)
d4.factor = factor(targets_smoker$Condition, levels=d4.levels)
d4.factor = relevel(d4.factor, "CTRL")
d4.age = as.numeric(targets_smoker$Age)

d4.sex = factor(targets_smoker$Sex, levels = unique(targets_smoker$Sex))
d4.sex = gsub(d4.sex, pattern = "1-Male", replacement = "Male")
d4.sex = gsub(d4.sex, pattern = "2-Female", replacement = "Female")
d4.sex = factor(d4.sex)

d4.smoker = as.character(factor(targets_smoker$smoker, levels = unique(targets_smoker$smoker)))
d4.smoker = gsub(d4.smoker, pattern = "1-Current", replacement = "Current")
d4.smoker[grep(d4.smoker, pattern = "^2")] =  "Ever"
d4.smoker = gsub(d4.smoker, pattern = "3-Never", replacement = "Never")
d4.smoker = factor(d4.smoker)
d4.smoker = relevel(d4.smoker, "Never")

d4.design = model.matrix(~ 0 + d4.factor + d4.age + d4.sex + d4.smoker)

## Fit the linear model
fit.4 = lmFit(E3.avg$A[,targets_smoker$FileName], d4.design)

## Contrast between IPF and CTRL
contrast.matrix4 <- makeContrasts("d4.factorILD-d4.factorCTRL", levels=d4.design)
fit.4 <- contrasts.fit(fit.4, contrast.matrix4)

## Apply empirical Bayes shrinkage
fit.4.2 <- eBayes(fit.4)

## Adjust p-values and sort
tt_ILD_4 = topTable(fit.4.2, adjust = 'fdr', coef = 1, genelist = E3.avg$genes, number = Inf)

## Results for IL11
tt_ILD_4[tt_ILD_4$SystematicName == 'NM_000641',]

Even when accounting for smoker status, IL11 is still significantly up-regulated in IPF.

Lung function

As a sanity check, we visualize and test for differences in 5 different lung function measurements between control an IPF. We first want to know whether we should run a t-test or a non-parametric test between CTRL and IPF lung-function values. We will perform 2 normality tests (Anderson-Darling (AD) and Wilkes-Shapiro (WS)). Both these tests assume as null hypothesis that the distribution from which the sample is normal, so the significance is a measure of the departure from normality (also visible in a QQ plot). As a caveat we must remember that these tests do not tell us whether the distribution is normal, just whether we can reject the null hypothesis that the sample comes from a normally distributed population.

## Plot results of normality tests and distributions
layout(matrix(1:6, nrow = 2))
for( i in colnames(targets_2)[13:17])
{   
    x = as.numeric(targets_2[!is.na(targets_2[,i]),i])
    at = nortest::ad.test(x)
    st = stats::shapiro.test(x)
    qqnorm(x, main = paste(i, "values"))
    qqline(x, col = 'red')
    legend('topleft', bty = 'n', legend = c(paste("AD p:", formatC(at$p.value, format = "e", digits = 3)), paste("WS p:", formatC(st$p.value, format = "e", digits = 3))))
    hist(x, breaks = 20, col = 'cadetblue', border = NA, main = paste(i, 'value distribution'))

}

Some sets of lung function values appear to depart slightly from the normal distribution and do not allow us to confidently reject the null hp under the commonly used threshold of p < 0.05. We may then use a non-parametric test in order to gauge the differences between CTRL and IPF for every lung function.

## Plot lung function values and test results
layout(matrix(1:6, nrow = 2))
for( i in colnames(targets_2)[13:17])
{   
    x_ipf = as.numeric(targets_2[!is.na(targets_2[,i]) & targets_2$Ild.subtype == "2-UIP/IPF", i])
    x_ctrl = as.numeric(targets_2[!is.na(targets_2[,i]) & targets_2$Ild.subtype == "CTRL", i])

    npt = wilcox.test(x_ctrl, x_ipf)

    boxplot(x_ipf, x_ctrl, border = c('salmon', 'purple'), lwd = 2, col = NA, main = paste(i, "IPF vs CTRL"), ylab = i, xaxt = 'n', ylim = c(0,max(c(x_ipf, x_ctrl)))+(max(c(x_ipf, x_ctrl))/10), outline = F)
    beeswarm(x_ipf, method = "hex", pch = 21, col = 'black', bg = paste(col2hex('salmon'), '4D', sep = ""), add = T)
    beeswarm(x_ctrl, at = 2, method = "hex", pch = 21, col = 'black', bg = paste(col2hex('purple'), '4D', sep = ""), add = T)

    segments(x0 = 1, x1 = 2, y0 = max(c(x_ipf, x_ctrl))+(max(c(x_ipf, x_ctrl))/20), y1 = max(c(x_ipf, x_ctrl))+(max(c(x_ipf, x_ctrl))/20))
    text(x = 1.5, y = max(c(x_ipf, x_ctrl))+(max(c(x_ipf, x_ctrl))/12), labels = formatC(npt$p.value, format = "e", digits = 3))
    axis(1, at = 1:2, labels = c("IPF", "CTRL"))

}

We can see how all lung functions are significantly different between healthy and IPF patients.

Let’s see what these look like when incorporating smoker status:

layout(matrix(1:6, nrow = 2))
for( i in colnames(targets_2)[13:17]){  

    ## Define target values: i is the lung function 
    x_ipf = targets_smoker[!is.na(targets_smoker[,i]) & targets_smoker$Ild.subtype == "2-UIP/IPF", ]
    x_ctrl = targets_smoker[!is.na(targets_smoker[,i]) & targets_smoker$Ild.subtype == "CTRL", ]
    x_ipf[,i] <- as.numeric(x_ipf[,i])
    x_ctrl[,i] <- as.numeric(x_ctrl[,i])

    ## Plot distributions of values as boxplots
    boxplot(x_ipf[,i]~factor(x_ipf$smoker, levels = unique(x_ipf$smoker)), border = "salmon", lwd = 2, xaxt = "n", col = NA, main = paste(i, "IPF vs CTRL"), ylab = i, ylim = c(-5,max(c(x_ipf[,i], x_ctrl[,i])))+(max(c(x_ipf[,i], x_ctrl[,i]))/10), outline = F, at = 1:length(unique(x_ipf$smoker)), xlim = c(0,7))

    boxplot(x_ctrl[,i] ~ factor(x_ctrl$smoker, levels = unique(x_ctrl$smoker)), border = "purple", lwd = 2, col = NA, outline = F, at = (length(unique(x_ipf$smoker)) + 1) : (length(unique(x_ipf$smoker)) + length(unique(x_ctrl$smoker))), add = T, xaxt = "n")
    
    ## Overlay data points
    beeswarm(x_ipf[,i] ~ factor(x_ipf$smoker, levels = unique(x_ipf$smoker)), method = "hex", pch = 21, cex = 0.7, col = 'black', bg = paste(col2hex('salmon'), '4D', sep = ""), at = 1:length(unique(x_ipf$smoker)), xlim = c(0,7), add = T)

    beeswarm(x_ctrl[,i] ~ factor(x_ctrl$smoker, levels = unique(x_ctrl$smoker)), method = "hex", pch = 21, cex = 0.7, col = 'black', bg = paste(col2hex('purple'), '4D', sep = ""), add = T, at = (length(unique(x_ipf$smoker)) + 1):(length(unique(x_ipf$smoker))+length(unique(x_ctrl$smoker))))


    axis(1, at = 1:(length(unique(x_ipf$smoker)) + length(unique(x_ctrl$smoker))), labels = c(unique(x_ipf$smoker), unique(x_ctrl$smoker)), las = 2)

}

Testing against current smokers is not possible due to lack of data points.

We show the results of Wilcoxon rank sum tests between ever- and never-smokers within patient groups, and between patient groups within smoker status:

for( i in colnames(targets)[13:17]){    

    ## Define target values: i is the lung function
    x_ipf = targets_smoker[!is.na(targets_smoker[,i]) & targets_smoker$Ild.subtype == "2-UIP/IPF", ]
    x_ctrl = targets_smoker[!is.na(targets_smoker[,i]) & targets_smoker$Ild.subtype == "CTRL", ]

## Wilcoxon rank sum test
print(paste(i, "Ever smoker - IPF vs CTRL:"))
print(wilcox.test(as.numeric(x_ipf[x_ipf$smoker == "2-Ever (>100)",i]), as.numeric(x_ctrl[x_ctrl$smoker == "2-Ever (>100)",i]))$p.value)

print(paste(i,"Never smoker - IPF vs CTRL:"))
print(wilcox.test(as.numeric(x_ipf[x_ipf$smoker == "3-Never",i]), as.numeric(x_ctrl[x_ctrl$smoker == "3-Never",i]))$p.value)
}
## [1] "predicted_dlco Ever smoker - IPF vs CTRL:"
## [1] 4.372678e-14
## [1] "predicted_dlco Never smoker - IPF vs CTRL:"
## Warning in wilcox.test.default(as.numeric(x_ipf[x_ipf$smoker == "3-
## Never", : cannot compute exact p-value with ties
## [1] 1.226289e-08
## [1] "predicted_fev1_postbd Ever smoker - IPF vs CTRL:"
## Warning in wilcox.test.default(as.numeric(x_ipf[x_ipf$smoker == "2-Ever
## (>100)", : cannot compute exact p-value with ties
## [1] 2.040983e-05
## [1] "predicted_fev1_postbd Never smoker - IPF vs CTRL:"
## Warning in wilcox.test.default(as.numeric(x_ipf[x_ipf$smoker == "3-
## Never", : cannot compute exact p-value with ties
## [1] 9.796501e-05
## [1] "predicted_fev1_prebd Ever smoker - IPF vs CTRL:"
## [1] 1.242058e-12
## [1] "predicted_fev1_prebd Never smoker - IPF vs CTRL:"
## Warning in wilcox.test.default(as.numeric(x_ipf[x_ipf$smoker == "3-
## Never", : cannot compute exact p-value with ties
## [1] 1.797282e-06
## [1] "predicted_fvc_postbd Ever smoker - IPF vs CTRL:"
## Warning in wilcox.test.default(as.numeric(x_ipf[x_ipf$smoker == "2-Ever
## (>100)", : cannot compute exact p-value with ties
## [1] 6.06989e-08
## [1] "predicted_fvc_postbd Never smoker - IPF vs CTRL:"
## Warning in wilcox.test.default(as.numeric(x_ipf[x_ipf$smoker == "3-
## Never", : cannot compute exact p-value with ties
## [1] 9.296639e-06
## [1] "predicted_fvc_prebd Ever smoker - IPF vs CTRL:"
## [1] 1.597747e-16
## [1] "predicted_fvc_prebd Never smoker - IPF vs CTRL:"
## Warning in wilcox.test.default(as.numeric(x_ipf[x_ipf$smoker == "3-
## Never", : cannot compute exact p-value with ties
## [1] 1.091856e-08

All lung functions are significantly different between IPF and CTRL regardless of smoker status, although the smoker status does increase slightly the significance when the CTRL have never smoked.

Now, within patient group and across smoker status:

for( i in colnames(targets)[13:17]){    

    ## Define target values: i is the lung function
    x_ipf = targets_smoker[!is.na(targets_smoker[,i]) & targets_smoker$Ild.subtype == "2-UIP/IPF", ]
    x_ctrl = targets_smoker[!is.na(targets_smoker[,i]) & targets_smoker$Ild.subtype == "CTRL", ]

## Wilcoxon rank sum test
print(paste(i, "IPF - Ever vs Never smoker"))
print(wilcox.test( as.numeric(x_ipf[x_ipf$smoker == "2-Ever (>100)",i]), as.numeric(x_ipf[x_ipf$smoker == "3-Never",i]))$p.value)

print(paste(i,"CTRL - Ever vs Never smoker"))
print(wilcox.test(as.numeric(x_ctrl[x_ctrl$smoker == "2-Ever (>100)",i]), as.numeric(x_ctrl[x_ctrl$smoker == "3-Never",i]))$p.value)
}
## [1] "predicted_dlco IPF - Ever vs Never smoker"
## [1] 0.4818604
## [1] "predicted_dlco CTRL - Ever vs Never smoker"
## [1] 0.05185144
## [1] "predicted_fev1_postbd IPF - Ever vs Never smoker"
## Warning in wilcox.test.default(as.numeric(x_ipf[x_ipf$smoker == "2-Ever
## (>100)", : cannot compute exact p-value with ties
## [1] 0.3996687
## [1] "predicted_fev1_postbd CTRL - Ever vs Never smoker"
## Warning in wilcox.test.default(as.numeric(x_ctrl[x_ctrl$smoker == "2-Ever
## (>100)", : cannot compute exact p-value with ties
## [1] 0.2749829
## [1] "predicted_fev1_prebd IPF - Ever vs Never smoker"
## [1] 0.9010611
## [1] "predicted_fev1_prebd CTRL - Ever vs Never smoker"
## [1] 0.5360915
## [1] "predicted_fvc_postbd IPF - Ever vs Never smoker"
## Warning in wilcox.test.default(as.numeric(x_ipf[x_ipf$smoker == "2-Ever
## (>100)", : cannot compute exact p-value with ties
## [1] 0.3852861
## [1] "predicted_fvc_postbd CTRL - Ever vs Never smoker"
## Warning in wilcox.test.default(as.numeric(x_ctrl[x_ctrl$smoker == "2-Ever
## (>100)", : cannot compute exact p-value with ties
## [1] 0.4905138
## [1] "predicted_fvc_prebd IPF - Ever vs Never smoker"
## [1] 0.631279
## [1] "predicted_fvc_prebd CTRL - Ever vs Never smoker"
## [1] 0.8269015

There is never a significant difference within patient group, across smoker status, for any function.

Linear models - checking covariates

We now want to know whether there is a statistically significant relationship between gene expression and age/sex, and smoker status. It can always be accounted for, but it is interesting to know how much this relationship weighs in on the rest of the analyses. Since we are mostly interested in IL11 in IPF, we will use its expression values to perform these tests. We will first do it checking age and sex, and add the smoker status later on.

## Define expression matrix
targets_only_IPF = targets_2[targets_2$Condition == "ILD",]
targets_only_IPF = targets_only_IPF[targets_only_IPF$Ild.subtype == "2-UIP/IPF",]
cor_targets = targets_only_IPF
Evalues = E3.avg$A[,cor_targets$FileName]


## Get expression values for IL11
j = "NM_000641"
expr_values = Evalues[j,]

## Get covariate levels
age = as.numeric(cor_targets$Age)[order(expr_values)]
sex2 = cor_targets$Sex[order(expr_values)]
sex = factor(sex2, levels = c("1-Male", "2-Female"))
cols = cor_targets$col_ipf[order(expr_values)]
expr_values = expr_values[order(expr_values)]

x <- expr_values

## Fit different linear models
fit1 <- lm(x ~ age)
fit2 <- lm(x ~ sex)
fit3 <- lm(x ~ age + sex)
fit4 <- lm(x ~ age + sex + age*sex)


fits <- list("expression_age" = fit1, "expression_sex" = fit2, "expression_age+sex" = fit3, "expression_age_sex_interaction" = fit4)

## Get residuals and test for normality, then plot results
for(l in 1:length(fits)){
    rr <- residuals(fits[[l]])
    at = nortest::ad.test(rr)
    st = stats::shapiro.test(rr)

    layout(matrix(1:4, nrow = 2))
    qqnorm(rr, main = paste(names(fits)[l], "residuals QQ plot"))
    qqline(rr, col = 'red')
    legend('topleft', bty = 'n', legend = c(paste("AD p:", formatC(at$p.value, format = "e", digits = 3)), paste("WS p:", formatC(st$p.value, format = "e", digits = 3))))
    hist(rr, breaks = 20, col = 'cadetblue', border = NA, main =  'LM residuals')
    plot(fits[[l]]$fitted.values, rr, pch = 16, ylab = "residuals", xlab = "fitted values", main = "Residuals vs fitted values")
            }

The residuals show that the data is heteroscedastic. The histogram looks normal, but if we want to be strict according to normality tests we cannot apply a linear model since the assumption of normality of residuals may be violated. We can, however, apply the Box-Cox transformation to gene expression values to make them conform more to the normal distribution, and see what happens to the standardized residuals:

## Box-Cox transformation
x_bc <- predict(caret::BoxCoxTrans(x),x)

## Fit the Box-Cox transformed values using the interaction model
fit5 = lm(x_bc ~ age + sex + age*sex)
## Get standardized residuals

rr = MASS::stdres(fit5)

## Normality tests
at = nortest::ad.test(rr)
st = stats::shapiro.test(rr)

## Plot results
layout(matrix(1:4, nrow = 2))
            qqnorm(rr, main = paste(i, "values"))
            qqline(rr, col = 'red')
            legend('topleft', bty = 'n', legend = c(paste("AD p:", formatC(at$p.value, format = "e", digits = 3)), paste("WS p:", formatC(st$p.value, format = "e", digits = 3))))
            hist(rr, breaks = 20, col = 'cadetblue', border = NA, main =  'LM residuals')
            plot(fit5$fitted.values, rr, pch = 16, ylab = "residuals", xlab = "fitted values", main = "Residuals vs fitted values")

We can now fit a linear model between Box-Cox-transformed gene expression and the two covariates, see how much they weigh in on the relationship:

    summary(lm(x_bc ~ age))
## 
## Call:
## lm(formula = x_bc ~ age)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0134873 -0.0022693 -0.0001712  0.0023164  0.0102230 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.895e-01  2.828e-03  173.09   <2e-16 ***
## age         -9.263e-05  4.348e-05   -2.13   0.0352 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.004016 on 120 degrees of freedom
## Multiple R-squared:  0.03645,    Adjusted R-squared:  0.02842 
## F-statistic: 4.539 on 1 and 120 DF,  p-value: 0.03517
    summary(lm(x_bc ~ sex))
## 
## Call:
## lm(formula = x_bc ~ sex)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0112806 -0.0023477 -0.0000814  0.0020214  0.0116886 
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  0.4836111  0.0004545 1064.079   <2e-16 ***
## sex2-Female -0.0001835  0.0007840   -0.234    0.815    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.00409 on 120 degrees of freedom
## Multiple R-squared:  0.0004561,  Adjusted R-squared:  -0.007873 
## F-statistic: 0.05476 on 1 and 120 DF,  p-value: 0.8154
    summary(lm(x_bc ~ age + sex))
## 
## Call:
## lm(formula = x_bc ~ age + sex)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0133148 -0.0023031 -0.0000328  0.0022443  0.0104049 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.897e-01  2.875e-03 170.333   <2e-16 ***
## age         -9.382e-05  4.374e-05  -2.145    0.034 *  
## sex2-Female -3.001e-04  7.744e-04  -0.387    0.699    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.00403 on 119 degrees of freedom
## Multiple R-squared:  0.03766,    Adjusted R-squared:  0.02149 
## F-statistic: 2.329 on 2 and 119 DF,  p-value: 0.1019
    summary(lm(x_bc ~ age + sex + age*sex))
## 
## Call:
## lm(formula = x_bc ~ age + sex + age * sex)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0146818 -0.0023062  0.0000864  0.0022414  0.0095423 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4.871e-01  3.650e-03 133.448   <2e-16 ***
## age             -5.400e-05  5.580e-05  -0.968    0.335    
## sex2-Female      6.300e-03  5.806e-03   1.085    0.280    
## age:sex2-Female -1.029e-04  8.968e-05  -1.147    0.254    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.004025 on 118 degrees of freedom
## Multiple R-squared:  0.04827,    Adjusted R-squared:  0.02408 
## F-statistic: 1.995 on 3 and 118 DF,  p-value: 0.1185

There is only a slight statistically significant relationship between gene expression and age (Adjusted Rsquared 0.02842), but when considering both age and sex in combination - with and without interaction - the relationship is not significant. Importantly, this was also the case when fitting a linear model without transforming the data:

            summary(lm(x ~ age))
## 
## Call:
## lm(formula = x ~ age)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0795 -0.4817 -0.1690  0.3064  4.0295 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.510778   0.622190  12.072  < 2e-16 ***
## age         -0.028438   0.009565  -2.973  0.00356 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8835 on 120 degrees of freedom
## Multiple R-squared:  0.06861,    Adjusted R-squared:  0.06085 
## F-statistic: 8.839 on 1 and 120 DF,  p-value: 0.003564
            summary(lm(x ~ sex))
## 
## Call:
## lm(formula = x ~ sex)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4986 -0.5095 -0.2001  0.2529  4.3828 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.64630    0.10161  55.567   <2e-16 ***
## sex2-Female  0.08923    0.17528   0.509    0.612    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9145 on 120 degrees of freedom
## Multiple R-squared:  0.002155,   Adjusted R-squared:  -0.00616 
## F-statistic: 0.2592 on 1 and 120 DF,  p-value: 0.6116
            summary(lm(x ~ age + sex))
## 
## Call:
## lm(formula = x ~ age + sex)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1106 -0.4913 -0.1648  0.3161  3.9967 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.478726   0.632625  11.822  < 2e-16 ***
## age         -0.028223   0.009625  -2.932  0.00404 ** 
## sex2-Female  0.054152   0.170403   0.318  0.75120    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8869 on 119 degrees of freedom
## Multiple R-squared:  0.0694, Adjusted R-squared:  0.05376 
## F-statistic: 4.437 on 2 and 119 DF,  p-value: 0.01385
            summary(lm(x ~ age + sex + age*sex))
## 
## Call:
## lm(formula = x ~ age + sex + age * sex)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6791 -0.4932 -0.1289  0.2746  3.6379 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      6.40336    0.79162   8.089 6.01e-13 ***
## age             -0.01166    0.01210  -0.964   0.3372    
## sex2-Female      2.79937    1.25912   2.223   0.0281 *  
## age:sex2-Female -0.04278    0.01945  -2.200   0.0298 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8729 on 118 degrees of freedom
## Multiple R-squared:  0.1061, Adjusted R-squared:  0.08333 
## F-statistic: 4.667 on 3 and 118 DF,  p-value: 0.004059

Adding smoker status, in IPF:

targets_only_IPF_smoker = targets_smoker[targets_smoker$Condition == "ILD",]
targets_only_IPF_smoker = targets_only_IPF_smoker[targets_only_IPF_smoker$Ild.subtype == "2-UIP/IPF",]
cor_targets_smoker = targets_only_IPF_smoker
Evalues = E3.avg$A[,cor_targets_smoker$FileName]


## Get expression values for IL11
j = "NM_000641"
expr_values = Evalues[j,]

## Get covariate levels
age = as.numeric(cor_targets$Age)[order(expr_values)]
sex2 = cor_targets$Sex[order(expr_values)]
sex = factor(sex2, levels = c("1-Male", "2-Female"))
cols = cor_targets$col_ipf[order(expr_values)]
expr_values = expr_values[order(expr_values)]
smoker2 <- cor_targets_smoker$smoker[order(expr_values)]
smoker = factor(smoker2, levels = c("1-Current", "2-Ever (>100)", "3-Never"))

x <- expr_values

## Fit different linear models

summary(lm(x ~ smoker))
## 
## Call:
## lm(formula = x ~ smoker)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5735 -0.5843 -0.1758  0.2232  4.3080 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.3105     0.9208   6.854 3.79e-10 ***
## smoker2-Ever (>100)  -0.7092     0.9270  -0.765    0.446    
## smoker3-Never        -0.5001     0.9314  -0.537    0.592    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9208 on 115 degrees of freedom
## Multiple R-squared:  0.01599,    Adjusted R-squared:  -0.001119 
## F-statistic: 0.9346 on 2 and 115 DF,  p-value: 0.3957
summary(lm(x ~ age + sex + smoker))
## 
## Call:
## lm(formula = x ~ age + sex + smoker)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5305 -0.5391 -0.1509  0.2231  4.3386 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          7.07352    1.12068   6.312 5.57e-09 ***
## age                 -0.01231    0.01025  -1.200    0.233    
## sex2-Female         -0.05008    0.18280  -0.274    0.785    
## smoker2-Ever (>100) -0.66281    0.93157  -0.711    0.478    
## smoker3-Never       -0.45695    0.93556  -0.488    0.626    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9229 on 113 degrees of freedom
## Multiple R-squared:  0.02867,    Adjusted R-squared:  -0.005711 
## F-statistic: 0.8339 on 4 and 113 DF,  p-value: 0.5064
summary(lm(x ~ age + smoker))
## 
## Call:
## lm(formula = x ~ age + smoker)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5164 -0.5615 -0.1583  0.2107  4.3530 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          7.06078    1.11517   6.332 4.96e-09 ***
## age                 -0.01210    0.01019  -1.188    0.237    
## smoker2-Ever (>100) -0.68022    0.92563  -0.735    0.464    
## smoker3-Never       -0.47256    0.93003  -0.508    0.612    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9191 on 114 degrees of freedom
## Multiple R-squared:  0.02803,    Adjusted R-squared:  0.002449 
## F-statistic: 1.096 on 3 and 114 DF,  p-value: 0.354
summary(lm(x ~ sex + smoker))
## 
## Call:
## lm(formula = x ~ sex + smoker)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5838 -0.5881 -0.1697  0.2245  4.2977 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          6.31054    0.92465   6.825 4.49e-10 ***
## sex2-Female         -0.03402    0.18266  -0.186    0.853    
## smoker2-Ever (>100) -0.69767    0.93292  -0.748    0.456    
## smoker3-Never       -0.48985    0.93697  -0.523    0.602    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9246 on 114 degrees of freedom
## Multiple R-squared:  0.01629,    Adjusted R-squared:  -0.009593 
## F-statistic: 0.6294 on 3 and 114 DF,  p-value: 0.5975
summary(lm(x ~ age + sex + age*sex*smoker))
## 
## Call:
## lm(formula = x ~ age + sex + age * sex * smoker)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5304 -0.5688 -0.1380  0.2223  4.3375 
## 
## Coefficients: (3 not defined because of singularities)
##                                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)                          7.15176    1.75495   4.075 8.76e-05
## age                                 -0.01357    0.02409  -0.563    0.574
## sex2-Female                          4.71014    2.93414   1.605    0.111
## smoker2-Ever (>100)                 -1.39440    2.03197  -0.686    0.494
## smoker3-Never                       -0.44829    0.93829  -0.478    0.634
## age:sex2-Female                     -0.07423    0.04535  -1.637    0.105
## age:smoker2-Ever (>100)              0.01124    0.02870   0.392    0.696
## age:smoker3-Never                         NA         NA      NA       NA
## sex2-Female:smoker2-Ever (>100)     -4.36845    3.31935  -1.316    0.191
## sex2-Female:smoker3-Never                 NA         NA      NA       NA
## age:sex2-Female:smoker2-Ever (>100)  0.06851    0.05133   1.335    0.185
## age:sex2-Female:smoker3-Never             NA         NA      NA       NA
##                                        
## (Intercept)                         ***
## age                                    
## sex2-Female                            
## smoker2-Ever (>100)                    
## smoker3-Never                          
## age:sex2-Female                        
## age:smoker2-Ever (>100)                
## age:smoker3-Never                      
## sex2-Female:smoker2-Ever (>100)        
## sex2-Female:smoker3-Never              
## age:sex2-Female:smoker2-Ever (>100)    
## age:sex2-Female:smoker3-Never          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9215 on 109 degrees of freedom
## Multiple R-squared:  0.0658, Adjusted R-squared:  -0.002764 
## F-statistic: 0.9597 on 8 and 109 DF,  p-value: 0.4714

There does not seem to be any linear relationship with smoker status, nor any increase in significance when adding smoker status in IPF.

Checking in CTRL:

targets_only_CTRL_smoker = targets_smoker[targets_smoker$Condition == "CTRL",]
targets_only_CTRL_smoker = targets_only_CTRL_smoker[targets_only_CTRL_smoker$Ild.subtype == "CTRL",]
cor_targets_smoker = targets_only_CTRL_smoker
Evalues = E3.avg$A[,cor_targets_smoker$FileName]


## Get expression values for IL11
j = "NM_000641"
expr_values = Evalues[j,]

## Get covariate levels
age = as.numeric(cor_targets$Age)[order(expr_values)]
sex2 = cor_targets$Sex[order(expr_values)]
sex = factor(sex2, levels = c("1-Male", "2-Female"))
cols = cor_targets$col_ipf[order(expr_values)]
expr_values = expr_values[order(expr_values)]
smoker2 <- cor_targets_smoker$smoker[order(expr_values)]
smoker = factor(smoker2, levels = c("1-Current", "2-Ever (>100)", "3-Never"))

x <- expr_values

## Fit different linear models

summary(lm(x ~ smoker))
## 
## Call:
## lm(formula = x ~ smoker)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1233 -0.3497 -0.1435  0.1774  3.0331 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          4.77812    0.46686  10.235 5.17e-16 ***
## smoker2-Ever (>100)  0.19569    0.47542   0.412    0.682    
## smoker3-Never        0.07188    0.48592   0.148    0.883    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6602 on 77 degrees of freedom
## Multiple R-squared:  0.00895,    Adjusted R-squared:  -0.01679 
## F-statistic: 0.3477 on 2 and 77 DF,  p-value: 0.7074
summary(lm(x ~ age + sex + smoker))
## 
## Call:
## lm(formula = x ~ age + sex + smoker)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1174 -0.4050 -0.1395  0.1919  2.8248 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          5.492098   0.763755   7.191 4.07e-10 ***
## age                 -0.010712   0.008643  -1.239    0.219    
## sex2-Female         -0.006979   0.164821  -0.042    0.966    
## smoker2-Ever (>100)  0.172070   0.492690   0.349    0.728    
## smoker3-Never        0.035669   0.497761   0.072    0.943    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6622 on 75 degrees of freedom
## Multiple R-squared:  0.02886,    Adjusted R-squared:  -0.02294 
## F-statistic: 0.5571 on 4 and 75 DF,  p-value: 0.6944
summary(lm(x ~ age + smoker))
## 
## Call:
## lm(formula = x ~ age + smoker)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1156 -0.4044 -0.1414  0.1939  2.8270 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          5.48364    0.73233   7.488 1.05e-10 ***
## age                 -0.01069    0.00857  -1.247    0.216    
## smoker2-Ever (>100)  0.17728    0.47395   0.374    0.709    
## smoker3-Never        0.03981    0.48486   0.082    0.935    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6579 on 76 degrees of freedom
## Multiple R-squared:  0.02883,    Adjusted R-squared:  -0.009503 
## F-statistic: 0.7521 on 3 and 76 DF,  p-value: 0.5245
summary(lm(x ~ sex + smoker))
## 
## Call:
## lm(formula = x ~ sex + smoker)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1219 -0.3538 -0.1444  0.1791  3.0345 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         4.772616   0.498071   9.582 1.03e-14 ***
## sex2-Female         0.005508   0.165092   0.033    0.973    
## smoker2-Ever (>100) 0.199768   0.493916   0.404    0.687    
## smoker3-Never       0.075089   0.498493   0.151    0.881    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6646 on 76 degrees of freedom
## Multiple R-squared:  0.008965,   Adjusted R-squared:  -0.03015 
## F-statistic: 0.2292 on 3 and 76 DF,  p-value: 0.8758
summary(lm(x ~ age + sex + age*sex*smoker))
## 
## Call:
## lm(formula = x ~ age + sex + age * sex * smoker)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.11437 -0.36220  0.00044  0.23340  2.24475 
## 
## Coefficients: (2 not defined because of singularities)
##                                     Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                         -2.51369    5.39537  -0.466   0.6427  
## age                                  0.11086    0.08210   1.350   0.1813  
## sex2-Female                          2.64832    2.35381   1.125   0.2644  
## smoker2-Ever (>100)                 10.06968    5.44667   1.849   0.0687 .
## smoker3-Never                        4.93643    5.03582   0.980   0.3303  
## age:sex2-Female                     -0.04051    0.03707  -1.093   0.2782  
## age:smoker2-Ever (>100)             -0.15072    0.08289  -1.818   0.0733 .
## age:smoker3-Never                   -0.07319    0.07628  -0.959   0.3407  
## sex2-Female:smoker2-Ever (>100)     -6.71267    2.66657  -2.517   0.0141 *
## sex2-Female:smoker3-Never                 NA         NA      NA       NA  
## age:sex2-Female:smoker2-Ever (>100)  0.10330    0.04183   2.470   0.0160 *
## age:sex2-Female:smoker3-Never             NA         NA      NA       NA  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6216 on 70 degrees of freedom
## Multiple R-squared:  0.2014, Adjusted R-squared:  0.09872 
## F-statistic: 1.962 on 9 and 70 DF,  p-value: 0.05705

There does not seem to be any linear relationship with smoker status, nor any increase in significance when adding smoker status in CTRL either.

Considering that there is no apparent benefit in including smoke, and lung functions are adjusted for age and sex only, the analysis would require multiple adjustments just to include smoker status with no obvious advantage. For this reason we will leave it out of the following analyses.

We then want to know whether there is any relationship between lung function and 3 IL6 family cytokines (IL11, IL6 and OSM), plus IL13, which was implicated in IPF pathogenesis. To do so we will do two types of analysis: fitting a linear model - within each patient class - in which log2(gene expression) is the predictor variable and lung function is the predicted variable, and Spearman correlations. However, since lung function is already adjusted for age and sex, we cannot just model the linear relationship adjusting both variables for those two covariates - lung function would be adjusted twice. So we first calculate the residuals of a linear model for log(gene expression), which, summed to the intercept of the model, will give us the “age- and sex-adjusted gene expression”. Then, a simple linear model between lung function and adjusted gene expression of the model can be fit in order to draw the precise trendline of the scatterplot between lung function and gene expression.

In order to know whether the assumptions for a linear model are met, we need again to know whether the residuals of the model are likely normally distributed.

We will do so using IL11 in IPF samples:

layout(matrix(1:6, nrow = 2))
targets_only_IPF = targets_2[targets_2$Condition == "ILD",]
targets_only_IPF = targets_only_IPF[targets_only_IPF$Ild.subtype == "2-UIP/IPF",]

for( i in colnames(targets)[13:17]){        

            ## Subset for samples with complete cases of each lung function i
            cor_targets = targets_only_IPF[which(!is.na(targets_only_IPF[,i])),]

            ## Get the expression matrix for these samples
            Evalues = E3.avg$A[,cor_targets$FileName]

            ## IL11 expression
            j = "NM_000641"
            expr_values = Evalues[j,]

            ## Lung function values
            lungf_values = as.numeric(cor_targets[,i])

            age = as.numeric(cor_targets$Age)[order(expr_values)]
            sex2 = cor_targets$Sex[order(expr_values)]
            sex = factor(sex2, levels = c("1-Male", "2-Female"))
            cols = cor_targets$col_ipf[order(expr_values)]
            lungf_values = lungf_values[order(expr_values)]
            expr_values = expr_values[order(expr_values)]

            x <- expr_values
            y <- lungf_values

            ## Fit first lm to adjust gene expression
            fit1 <- lm(x ~ age + sex)
            res1 = residuals(fit1)

            ## Adjusted IL11 expression: residuals + coefficient
            adjx = res1 + fit1$coefficients[1]

            ## Fit lm with covariates
            fit2 <- lm(y ~ adjx)


            ## Extract residuals
            res2 = residuals(fit2)

            ## Normality tests
            at = nortest::ad.test(res2)
            st = stats::shapiro.test(res2)

            ## QQ plot
            qqnorm(res2, main = paste(i, "LM residuals"))
            qqline(res2, col = 'red')
            legend('topleft', bty = 'n', legend = c(paste("AD p:", formatC(at$p.value, format = "e", digits = 3)), paste("WS p:", formatC(st$p.value, format = "e", digits = 3))))
            hist(res2, breaks = 20, col = 'slateblue', border = NA, main = paste(i, 'LM residuals'))

        
}

In this case, only the residuals for DLCO % predicted seem to be departing significantly from a normal distribution. With a sample size this big (> 100 samples), however, we should still be able to fit a linear model. It is likely that the tails have thrown off the test. We can also determine normality in a more coarse way by looking at their histogram and notice how there is a tail of high residuals on the right-hand side. Some measurements are more sparse than others because not all lung functions were measured in all patients, or in the same set of patients.

Given the presence of outliers, we can fit a robust linear model to confirm the relationship between IL11 and DLCO % predicted:

            cor_targets = targets_only_IPF[which(!is.na(targets_only_IPF[,13])),]
            lungf_values <- as.numeric(cor_targets[,13])
            Evalues = E3.avg$A[,cor_targets$FileName]
            j = "NM_000641"

            expr_values = Evalues[j,]
            age = as.numeric(cor_targets$Age)[order(expr_values)]
            sex2 = cor_targets$Sex[order(expr_values)]
            sex = factor(sex2, levels = c("1-Male", "2-Female"))
            lungf_values = lungf_values[order(expr_values)]
            expr_values = expr_values[order(expr_values)]
            x <- expr_values
            y <- lungf_values

            ## Get adjusted values
            fit1 <- lm(x ~ age + sex)

            res1 = residuals(fit1)

            adjx = res1 + fit1$coefficients[1]
            
            ## Fit robust linear model
            fit3 <- robust::lmRob(y ~ adjx)

summary(fit3)
## 
## Call:
## robust::lmRob(formula = y ~ adjx)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -40.7438  -7.3848   0.3372   9.5612  45.7766 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  116.857     13.674   8.546 9.15e-14 ***
## adjx          -9.687      1.895  -5.111 1.40e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.14 on 108 degrees of freedom
## Multiple R-Squared: 0.1751 
## 
## Test for Bias:
##             statistic  p-value
## M-estimate      1.028 0.598237
## LS-estimate    11.678 0.002912

The relationship is significant also for a robust model.

Correlation analysis

Correlation with lung function - IPF

In order to find a correlation between lung function and gen expression we need to calculate semi-partial correlations between lung function and log2(gene expression). Semi-partial correlations will only be corrected for gene expression and not for lung function (which is already adjusted). Moreover, as demonstrated earlier, we fit a robust linear model and a normal linear model between adjusted gene expression and lung function, and calculate the 95% confidence interval on the coefficient by bootstrapping.

dofour = TRUE

## Define targets: 4 IL-6 family cytokines
cytokines = c('IL11','IL13','OSM','IL6')

## Get the refseq ID for the transcripts that are present in this chip layout
NM_in_chip_ck = intersect(bm[bm$hgnc_symbol %in% cytokines,2], rownames(E3.avg$A))

## Small data frame containing refseq IDs, ensembl IDs and gene symbols
ck_nm = bm[bm$refseq_mrna %in% NM_in_chip_ck,]

fibroid = cytokines
fibroid_in_chip = intersect(bm[bm$hgnc_symbol %in% fibroid,2], rownames(E3.avg$A))
fibroid_in_chip = bm[bm$refseq_mrna %in% fibroid_in_chip,]

## Subset targets according to IPF classification
targets_only_IPF = targets_2[targets_2$Condition == "ILD",]
targets_only_IPF = targets_only_IPF[targets_only_IPF$Ild.subtype == "2-UIP/IPF",]

cordata_ipf = list()

## Long loop in which statistics will be evaluated, plots saved to PDF and a table with these statistics will be generated

for( i in colnames(targets)[13:17])
{   
    cordata_ipf[[i]] = list()
    cor_targets = targets_only_IPF[which(!is.na(targets_only_IPF[,i])),]
    Evalues = E3.avg$A[,cor_targets$FileName]
    resdf = matrix(ncol = 9)

    for (j in fibroid_in_chip$refseq_mrna)
        {

            expr_values = Evalues[j,]
            lungf_values = as.numeric(cor_targets[,i])

            cor_targets$col_ipf = cor_targets$Ild.subtype
            cor_targets$col_ipf = gsub(cor_targets$col_ipf, pattern = "2-UIP/IPF", replacement = "#27AAE1")

            ## Covariates
            age = as.numeric(cor_targets$Age)[order(expr_values)]
            sex2 = cor_targets$Sex[order(expr_values)]
            sex = factor(sex2, levels = c("1-Male", "2-Female"))
            cols = cor_targets$col_ipf[order(expr_values)]
            lungf_values = lungf_values[order(expr_values)]
            expr_values = expr_values[order(expr_values)]
            sex2[sex2 == "1-Male"] = 0
            sex2[sex2 == "2-Female"] = 1

            ## Save covariate values in a data frane for easier referencing
            ddf = data.frame("expr" = as.numeric(expr_values), "func" = as.numeric(lungf_values), "age" = as.numeric(age), "sex" = as.numeric(sex2))

            ## First lm to generate adjusted gene expression values
            x <- ddf$expr
            y <- lungf_values

            fit1 <- lm(x ~ age + sex)
            res1 = residuals(fit1)
            adjx = res1 + fit1$coefficients[1]

        
            ## Fit lm with adjusted expression values
            fit2 <- lm(y ~ adjx)

            ## Fit robust lm with adjusted expression values
            fit3 <- robust::lmRob(y ~ adjx)

            flmp = summary(fit2)$coefficients[2,4]
            flmb = summary(fit2)$coefficients[2,1]
            rlmb = summary(fit3)$coefficients[2,1]

            ddf2 = data.frame("epxr" = as.numeric(adjx), "func" = as.numeric(lungf_values))

            ## Semipartial Spearman correlation test. Also gives confidence intervals. In this correlation test we use unadjusted values, because covariate values are accounted for in the semipartial correlation. 
            spct = RVAideMemoire::pcor.test(x = ddf$func, y = ddf$expr, z = ddf[,3:4], method = 'spearman', semi = TRUE, conf.level = 0.95)
            corrho = spct$estimate
            corp = spct$p.value
            corconf = spct$conf.int

            lmboot = function(data, b)
            {  
                d = data[b,]
                return(lm(d[,2]~d[,1], data = d)$coef)  
            }

            ## To bootstrap the linear model coefficients we use the adjusted values, as done before
            bootst_coefs = boot::boot(data = ddf2, statistic=lmboot, R=5000)
            boot_ci = car::Confint(bootst_coefs, level = 0.95)

            ## Collate results together
            allres = c(flmp, flmb, corrho, corp, corconf[1], corconf[2], rlmb, boot_ci[2,1], boot_ci[2,2])
            resdf = rbind(resdf, allres)
            colnames(resdf) = c("Full_lm_pval", "Full_lm_beta1", "Spearman_semipartial_rho", "Spearman_semipartial_pval", "Spearman_95CI_Inf", "Spearman_95CI_Sup","Robust_lm_beta", "Beta_95CI_Inf", "Beta_95CI_Sup")

            ## Make and save plots
            pdf(file = paste("./Lung_Function_Correlations/",i, "/", fibroid_in_chip[fibroid_in_chip$refseq_mrna == j,3],"_vs_", i, "_IPFonly.pdf",sep=""), useDingbats = F, width = 7, height = 7)
            plot(adjx, y, pch = 16, cex = 0.7, xlab = paste('Age- and sex- adjusted log2(', fibroid_in_chip[fibroid_in_chip$refseq_mrna == j,3], ')', sep = ""), col = cols , ylab = i, main = paste(fibroid_in_chip[fibroid_in_chip$refseq_mrna == j,3]," and ", i, sep=""), las = 1)
            abline(fit2$coefficients[1:2], col='red')
            legend('topright', bty = 'n', legend = c(paste("Correlation [95% CI] = ", as.numeric(round(corrho, digits = 3)), " [", round(corconf[1],3),", ",round(corconf[2],3), "]", sep = ''), paste("Correlation p =", formatC(corp, digits = 3, format = "e"), sep = " "), paste("LM p =", formatC(flmp, digits = 3, format = "e"), sep = " "), paste("LM beta [95% CI] = ", round(flmb, digits = 3), "[", round(boot_ci[2,1], digits = 3), ", ", round(boot_ci[2,2], digits = 3), "]", sep = "")), cex = 0.7)
            dev.off()

        }

        ## Write table with the results
        cordata_ipf[[i]] = as.data.frame(resdf[2:nrow(resdf),])
        rownames(cordata_ipf[[i]]) = fibroid_in_chip$refseq_mrna
        cordata_ipf[[i]]$gene = fibroid_in_chip$hgnc_symbol
        write.table(file = paste("./Lung_Function_Correlations/", i, "/", i, "_IPF_LungFunction_correlations_table.txt",sep = ""), cordata_ipf[[i]], quote = F, sep = "\t")

        dofour = FALSE
}

We now generate the bubble map for these cytokines in IPF:

alldf_ipf = do.call(cbind, cordata_ipf)
pdf_ipf = alldf_ipf[,seq(4, (10*length(cordata_ipf)), by = 10)]
cordf_ipf = alldf_ipf[,seq(3, (10*length(cordata_ipf)), by = 10)]
padjdf_ipf = as.data.frame(t(apply(pdf_ipf, 1, FUN = function(x) p.adjust(x, method = 'fdr'))))
colnames(cordf_ipf) = c("DLCO", "FEV1_postBD", "FEV1_preBD", "FVC_postBD", "FVC_preBD")
rownames(fibroid_in_chip) = fibroid_in_chip$refseq_mrna
rownames(cordf_ipf) = fibroid_in_chip[rownames(cordf_ipf), 3]
par(mar = c(5,2,2,6))
bubbleMap(cordf_ipf, padjdf_ipf, color_pal = viridis(25, option = "C"), main = "Semi-partial correlation with lung function in IPF", cex.binned = F, cbins = 5)
## Using func as id variables


IL11 is the most negatively correlated with DLCO % predicted, which is the most important index for IPF severity. A low DLCO means high disease severity, so a high negative correlation means that there is an inverse relationship with lung function and IL11 expression.

Correlation with lung function - CTRL

We do the same analyis in the control, trying to understand whether the same correlations hold true for healthy patients.

dofive = TRUE

targets_only_CTRL = targets_2[targets_2$Condition == "CTRL",]
targets_only_CTRL = targets_only_CTRL[targets_only_CTRL$Ild.subtype == "CTRL",]

cordata_ctrl = list()
for( i in colnames(targets)[13:17])
{   
    cordata_ctrl[[i]] = list()
    cor_targets = targets_only_CTRL[which(!is.na(targets_only_CTRL[,i])),]
    Evalues = E3.avg$A[,cor_targets$FileName]
    resdf = matrix(ncol = 9)
    for (j in fibroid_in_chip$refseq_mrna)
        {

            expr_values = Evalues[j,]
            lungf_values = as.numeric(cor_targets[,i])

            cor_targets$col_ipf = cor_targets$Ild.subtype
            cor_targets$col_ipf = gsub(cor_targets$col_ipf, pattern = "CTRL", replacement = "#606161")

            age = as.numeric(cor_targets$Age)[order(expr_values)]
            sex2 = cor_targets$Sex[order(expr_values)]
            sex = factor(sex2, levels = c("1-Male", "2-Female"))
            cols = cor_targets$col_ipf[order(expr_values)]
            lungf_values = lungf_values[order(expr_values)]
            expr_values = expr_values[order(expr_values)]
            sex2[sex2 == "1-Male"] = 0
            sex2[sex2 == "2-Female"] = 1

            ddf = data.frame("expr" = as.numeric(expr_values), "func" = as.numeric(lungf_values), "age" = as.numeric(age), "sex" = as.numeric(sex2))

            x <- ddf$expr
            y <- lungf_values

            ## Get adjusted expression values
            fit1 <- lm(x ~ age + sex)
            res1 = residuals(fit1)
            adjx = res1 + fit1$coefficients[1]
            
        
            ## Fit lm without covariates
            fit2 <- lm(y ~ adjx)

            ## Fit robust lm
            fit3 <- robust::lmRob(y ~ adjx)


            flmp = summary(fit2)$coefficients[2,4]
            flmb = summary(fit2)$coefficients[2,1]
            rlmb = summary(fit3)$coefficients[2,1]

            ddf2 = data.frame("epxr" = as.numeric(adjx), "func" = as.numeric(lungf_values))
            
            lmboot = function(data, b)
            {  
                d = data[b,]
                return(lm(d[,2]~d[,1], data = d)$coef)  
            }

            bootst_coefs = boot::boot(data = ddf2, statistic=lmboot, R=5000)
            boot_ci = car::Confint(bootst_coefs, level = 0.95)

            
            spct = RVAideMemoire::pcor.test(x = ddf$func, y = ddf$expr, z = ddf[,3:4], method = 'spearman', semi = TRUE, conf.level = 0.95)
            corrho = spct$estimate
            corp = spct$p.value
            corconf = spct$conf.int


            allres = c(flmp, flmb, corrho, corp, corconf[1], corconf[2], rlmb, boot_ci[2,1], boot_ci[2,2])
            resdf = rbind(resdf, allres)
            colnames(resdf) = c("Full_lm_pval", "Full_lm_beta", "Spearman_semipartial_rho", "Spearman_semipartial_pval", "Spearman_95CI_Inf", "Spearman_95CI_Sup", "Robust_lm_beta", "Beta_95CI_Inf", "Beta_95CI_Sup")
            pdf(file = paste("./Lung_Function_Correlations/",i, "/", i, fibroid_in_chip[fibroid_in_chip$refseq_mrna == j,3],"_vs_", i, "_CTRLonly.pdf",sep=""), useDingbats = F, width = 7, height = 7)
            plot(adjx, y, pch = 16, cex = 0.7, xlab = paste('Age- and sex- adjusted log2(', fibroid_in_chip[fibroid_in_chip$refseq_mrna == j,3], ')', sep = ""), col = cols , ylab = i, main = paste(fibroid_in_chip[fibroid_in_chip$refseq_mrna == j,3]," and ", i, sep=""), las = 1)
            abline(fit2$coefficients[1:2], col='red')
            legend('topright', bty = 'n', legend = c(paste("Correlation [95% CI] = ", as.numeric(round(corrho, digits = 3)), " [", round(corconf[1],3),", ",round(corconf[2],3), "]", sep = ''), paste("Correlation p =", formatC(corp, digits = 3, format = "e"), sep = " "), paste("LM p =", formatC(flmp, digits = 3, format = "e"), sep = " "), paste("LM beta [95% CI] = ", round(flmb, digits = 3), "[", round(boot_ci[2,1], digits = 3), ", ", round(boot_ci[2,2], digits = 3), "]", sep = "")), cex = 0.7)
        dev.off()
        }
        cordata_ctrl[[i]] = as.data.frame(resdf[2:nrow(resdf),])
        rownames(cordata_ctrl[[i]]) = fibroid_in_chip$refseq_mrna
        cordata_ctrl[[i]]$gene = fibroid_in_chip$hgnc_symbol
        write.table(file = paste("./Lung_Function_Correlations/", i, "/", i, "_CTRL_LungFunction_correlations_table.txt",sep = ""), cordata_ctrl[[i]], quote = F, sep = "\t")
}

dofive = FALSE

We now generate the bubble map for these cytokines in CTRL:

alldf_ctrl = do.call(cbind, cordata_ctrl)
pdf_ctrl = alldf_ctrl[,seq(4, (10*length(cordata_ctrl)), by = 10)]
cordf_ctrl = alldf_ctrl[,seq(3, (10*length(cordata_ctrl)), by = 10)]
padjdf_ctrl = as.data.frame(t(apply(pdf_ctrl, 1, FUN = function(x) p.adjust(x, method = 'fdr'))))
colnames(cordf_ctrl) = c("DLCO", "FEV1_postBD", "FEV1_preBD", "FVC_postBD", "FVC_preBD")
rownames(fibroid_in_chip) = fibroid_in_chip$refseq_mrna
rownames(cordf_ctrl) = fibroid_in_chip[rownames(cordf_ctrl), 3]
par(mar = c(5,2,2,6))
bubbleMap(cordf_ctrl, padjdf_ctrl, color_pal = viridis(25, option = "C"), main = "Semi-partial correlation with lung function in control", cex.binned = F, cbins = 5)
## Using func as id variables

We try another type of plot that captures all the information above:

plot(x = c(do.call(c,cordf_ipf),do.call(c,cordf_ctrl)), y = -log10(c(do.call(c,padjdf_ipf), do.call(c,padjdf_ctrl))), pch = rep(1:5,each = 10) , col = c(rep('salmon', 20), rep('purple', 20)), xlab = "Spearman semi-partial correlation", ylab = "-log10(FDR) correlation test", xlim = c(-0.4, 0.2), main = "IL-6 family and lung function correlation")
abline(h = 1.3, lty = 2, col = 'gray')
abline(v = 0, lty = 2, col = 'gray')
text(x = c(do.call(c,cordf_ipf),do.call(c,cordf_ctrl)), 
    y = -log10(c(do.call(c,padjdf_ipf), do.call(c,padjdf_ctrl))), 
    labels = rep(fibroid_in_chip[rownames(padjdf_ipf),3], 10), 
    cex = 0.5, 
    pos = plotrix::thigmophobe(
        x = c(do.call(c,cordf_ipf),do.call(c,cordf_ctrl)), 
        y = -log10(c(do.call(c,padjdf_ipf), do.call(c,padjdf_ctrl)))
        )
    )
legend("topright", bty = "n", pch = c(1:5, 1,1), legend = c("DLCO %pred", "FEV1 post-BD %pred", "FEV1 pre-BD %pred", "FVC post-BD %pred", "FVC pre-BD %pred", "CTRL", "IPF"), col = c(rep('black', 5), "purple", "salmon"), pt.cex = 1, cex = 0.6, title = "Symbol legend")

While slightly more cluttered than the former plots, this scatterplot shows that IL11 is the most significant, most negatively correlated gene (amongst the 4 cytokines) with DLCO.

Fibrosis genes

We can do a similar analysis correlating the expression of IL6-family cytokines (plus IL13) to fibrosis genes: FN1, ACTA2, Collagens (1A1/1A2/2A1), TIMP1, MMP2 and MMP9. As opposed to earlier, this has to be a partial correlation analysis, since both gene expression values will have to be corrected for age and sex. We first look at their differences according to the limma analysis:

mkrs_nm_fib_l = bm[bm$hgnc_symbol %in% c("FN1", "ACTA2", "COL3A1", "COL1A1", "COL1A2", "TIMP1", "MMP2", "MMP9"),]
mkrs_nm_fib = bm[bm$refseq_mrna %in% intersect(mkrs_nm_fib_l$refseq_mrna, rownames(E3.avg$A)),]
rownames(mkrs_nm_fib) = mkrs_nm_fib$ensembl_gene_id
layout(matrix(1:8, nrow = 4))
for(i in rownames(mkrs_nm_fib))
{   
    par(mar=c(2,4,2,1))
    beeswarm(E3.avg$A[mkrs_nm_fib[i,2],targets_2$FileName] ~ targets_2$Condition, method = 'swarm', pch = 21, col = c("purple", "salmon"), cex = 0.6, bg = 'white', ylab = paste('log2(',mkrs_nm_fib[i,3],' avg exp)', sep = ""), xlab = NA, main = mkrs_nm_fib[i,3], ylim = c(0,max(E3.avg$A[mkrs_nm_fib[i,2],targets_2$FileName])+2))
    boxplot(E3.avg$A[mkrs_nm_fib[i,2],targets_2$FileName] ~ targets_2$Condition, col = NA, border = 'black', lwd = 1, pch = 21, bg = 'white', xaxt = 'n', yaxt = 'n', outline = F, boxwex = 0.5, add = T)
    segments(x0 = 1, x1 = 2, y0 = max(E3.avg$A[mkrs_nm_fib[i,2],targets_2$FileName])+1, y1 = max(E3.avg$A[mkrs_nm_fib[i,2],targets_2$FileName])+1)
    text(x = 1.5, y = max(E3.avg$A[mkrs_nm_fib[i,2],targets_2$FileName])+1.5, labels = formatC(tt_ILD_3[tt_ILD_3$SystematicName == mkrs_nm_fib[i,2],9], format = 'e', digits = 3))

}

Correlation with fibrosis genes - IPF

We then calculate gene-wise correlations in IPF:

corlist_ck_IPFonly = list()

for(j in mkrs_nm_fib$refseq_mrna)
    {           
                resdf = matrix(ncol = 4)
                Evalues = E3.avg$A[,targets_only_IPF$FileName]
                if(is.infinite(sum(as.numeric(log2(Evalues[j,]))))) next
                corlist_ck_IPFonly[[as.character( mkrs_nm_fib[mkrs_nm_fib$refseq_mrna == j,3])]] = list()
        for(i in ck_nm$refseq_mrna) 

            {
                ck_expression = Evalues[i,]
                fib_expression = Evalues[j,]
                age = as.numeric(targets_only_IPF$Age)[order(ck_expression)]
                sex2 = targets_only_IPF$Sex[order(ck_expression)]
                sex = factor(sex2, levels = c("1-Male", "2-Female"))
                fib_expression = fib_expression[order(ck_expression)]
                ck_expression = ck_expression[order(ck_expression)]
                sex2[sex2 == "1-Male"] = 0
                sex2[sex2 == "2-Female"] = 1

                ddf = data.frame("expr_c" = as.numeric(ck_expression), "expr_f" = as.numeric(fib_expression), "age" = as.numeric(age), "sex" = as.numeric(sex2))

                spct = RVAideMemoire::pcor.test(x = ddf$expr_c, y = ddf$expr_f, z = ddf[,3:4], method = 'spearman', semi = FALSE, conf.level = 0.95)
                corrho = spct$estimate
                corp = spct$p.value
                corconf = spct$conf.int


                allres = c(corrho, corp, corconf[1], corconf[2])
                resdf = rbind(resdf, allres)
                
            }
            corlist_ck_IPFonly[[as.character( mkrs_nm_fib[mkrs_nm_fib$refseq_mrna == j,3])]][[ ck_nm[ck_nm$refseq_mrna == i,3]]] = resdf[2:nrow(resdf),]
            genesymbol <-mkrs_nm_fib$hgnc_symbol[ which(mkrs_nm_fib$refseq_mrna == j)]
        write.table(file = paste("./Fibrosis_Genes_Correlations/", genesymbol, "/", genesymbol, "_IPF_Fibrosis_correlations_table.txt", sep = ""), resdf, quote = F, sep = "\t")
    }

The bubblemap reveals that IL11 is the only gene correlating with other fibrosis markers in IPF:

alldf_fib_ipf = as.data.frame(do.call(cbind, (do.call(cbind, corlist_ck_IPFonly))), stringsAsFactors = F)
rownames(alldf_fib_ipf) = ck_nm$hgnc_symbol
pdf_fib_ipf = alldf_fib_ipf[,seq(2, (4*length(corlist_ck_IPFonly)), by = 4)]
cordf_fib_ipf = alldf_fib_ipf[,seq(1, (4*length(corlist_ck_IPFonly)), by = 4)]
padjdf_fib_ipf = as.data.frame(t(apply(pdf_fib_ipf, 1, FUN = function(x) p.adjust(x, method = 'fdr'))))
colnames(cordf_fib_ipf) = mkrs_nm_fib$hgnc_symbol
par(mar = c(5,2,2,6))
bubbleMap(cordf_fib_ipf, padjdf_fib_ipf, color_pal = viridis(25, option = "D"), main = "Partial correlation with fibrosis genes in IPF", cex.binned = F, cbins = 5)
## Using func as id variables

Correlation with fibrosis genes - CTRL

We repeat the analysis in control tissues:

corlist_ck_CTRLonly = list()

for(j in mkrs_nm_fib$refseq_mrna)
    {           
                resdf = matrix(ncol = 4)
                Evalues = E3.avg$A[,targets_only_CTRL$FileName]
                if(is.infinite(sum(as.numeric(log2(Evalues[j,]))))) next
                corlist_ck_CTRLonly[[as.character( mkrs_nm_fib[mkrs_nm_fib$refseq_mrna == j,3])]] = list()
        for(i in ck_nm$refseq_mrna) 

            {
                ck_expression = Evalues[i,]
                fib_expression = Evalues[j,]
                age = as.numeric(targets_only_IPF$Age)[order(ck_expression)]
                sex2 = targets_only_IPF$Sex[order(ck_expression)]
                sex = factor(sex2, levels = c("1-Male", "2-Female"))
                fib_expression = fib_expression[order(ck_expression)]
                ck_expression = ck_expression[order(ck_expression)]
                sex2[sex2 == "1-Male"] = 0
                sex2[sex2 == "2-Female"] = 1

                ddf = data.frame("expr_c" = as.numeric(ck_expression), "expr_f" = as.numeric(fib_expression), "age" = as.numeric(age), "sex" = as.numeric(sex2))

                spct = RVAideMemoire::pcor.test(x = ddf$expr_c, y = ddf$expr_f, z = ddf[,3:4], method = 'spearman', semi = FALSE, conf.level = 0.95)
                corrho = spct$estimate
                corp = spct$p.value
                corconf = spct$conf.int

                allres = c(corrho, corp, corconf[1], corconf[2])
                resdf = rbind(resdf, allres)
                
            }
            genesymbol <- mkrs_nm_fib$hgnc_symbol[ which(mkrs_nm_fib$refseq_mrna == j)]
            corlist_ck_CTRLonly[[as.character( mkrs_nm_fib[mkrs_nm_fib$refseq_mrna == j,3])]][[ ck_nm[ck_nm$refseq_mrna == i,3]]] = resdf[2:nrow(resdf),]
        write.table(file = paste("./Fibrosis_Genes_Correlations/", genesymbol, "/", genesymbol, "_CTRL_Fibrosis_correlations_table.txt", sep = ""), resdf, quote = F, sep = "\t")
    }

Repeating the analysis only in controls reveals a different pattern, in which the only significant correlations for IL11 are with TIMP1 and COL3A1:

alldf_fib_ctrl = as.data.frame(do.call(cbind, (do.call(cbind, corlist_ck_CTRLonly))), stringsAsFactors = F)
rownames(alldf_fib_ctrl) = ck_nm$hgnc_symbol
pdf_fib_ctrl = alldf_fib_ctrl[,seq(2, (4*length(corlist_ck_CTRLonly)), by = 4)]
cordf_fib_ctrl = alldf_fib_ctrl[,seq(1, (4*length(corlist_ck_CTRLonly)), by = 4)]
padjdf_fib_ctrl = as.data.frame(t(apply(pdf_fib_ctrl, 1, FUN = function(x) p.adjust(x, method = 'fdr'))))
colnames(cordf_fib_ctrl) = mkrs_nm_fib$hgnc_symbol
par(mar = c(5,2,2,6))
bubbleMap(cordf_fib_ctrl, padjdf_fib_ctrl, color_pal = viridis(25, option = "D"), main = "Partial correlation with fibrosis genes in control", cex.binned = F, cbins = 5)
## Using func as id variables

We try again the scatterplot mixing all the information:

fibroid_in_chip_2 = fibroid_in_chip[]
plot(x = c(do.call(c,cordf_fib_ipf),do.call(c,cordf_fib_ctrl)), y = -log10(c(do.call(c,padjdf_fib_ipf), do.call(c,padjdf_fib_ctrl))), pch = rep(1:8, each = 10) , col = c(rep('salmon', 32), rep('purple', 32)), xlab = "Spearman partial correlation", ylab = "-log10(FDR) correlation test", xlim = c(-0.5, 0.5), main = "IL-6 family and fibrosis gene expression correlation")
abline(h = 1.3, lty = 2, col = 'gray')
abline(v = 0, lty = 2, col = 'gray')
text(x = c(do.call(c,cordf_fib_ipf),do.call(c,cordf_fib_ctrl)), 
    y = -log10(c(do.call(c,padjdf_fib_ipf), do.call(c,padjdf_fib_ctrl))), 
    labels = rep(fibroid_in_chip[rownames(padjdf_ipf),3], 16), 
    cex = 0.5, 
    pos = plotrix::thigmophobe(
        x = c(do.call(c,cordf_fib_ipf),do.call(c,cordf_fib_ctrl)), 
        y = -log10(c(do.call(c,padjdf_fib_ipf), do.call(c,padjdf_fib_ctrl)))
        )
    )
legend("topleft", bty = "n", pch = c(1:8, 1,1), legend = c(mkrs_nm_fib$hgnc_symbol, "CTRL", "IPF"), col = c(rep('black', 8), "purple", "salmon"), pt.cex = 1, cex = 0.6, title = "Symbol legend")

While this is far from being easily readable, we can see how IL11 correlates positively and significantly with TIMP1, MMP2, COL3A1, COL1A1, ACTA2 in IPF.

Bubblemaps using both CTRL and IPF

We plot the same bubblemaps as before, only mixing IPF and CTRL in order to have the same scaling of dots and colours. Moreover we switch to a divergent color scale, more suitable for correlation values.

Lung function bubblemap:

colnames(cordf_ctrl) <- paste0(colnames(cordf_ctrl), "_CTRL")
colnames(cordf_ipf) <- paste0(colnames(cordf_ipf), "_IPF")

colnames(padjdf_ctrl) <- paste0(colnames(padjdf_ctrl), "_CTRL")
colnames(padjdf_ipf) <- paste0(colnames(padjdf_ipf), "_IPF")

cordf_lf_both <- cbind(cordf_ctrl, cordf_ipf)
padjdf_lf_both <- cbind(padjdf_ctrl, padjdf_ipf)
rownames(padjdf_lf_both) <- rownames(cordf_lf_both)

cordf_lf_both <- cordf_lf_both[c("OSM", "IL11", "IL6", "IL13"),]
padjdf_lf_both <- padjdf_lf_both[c("OSM", "IL11", "IL6", "IL13"),]

pal = colorspace::divergingx_hcl(25, "RdBu", rev = FALSE)[4:22]

par(mar = c(8,2,2,6))
bubbleMap(cordf_lf_both, padjdf_lf_both, color_pal = pal, main = "Semipartial correlation with lung function in IPF and CTRL", cex.binned = FALSE, cbins = 5)
## Using func as id variables

Fibrosis genes bubblemap:

alldf_fib_ipf = as.data.frame(do.call(cbind, (do.call(cbind, corlist_ck_IPFonly))), stringsAsFactors = F)
alldf_fib_ctrl = as.data.frame(do.call(cbind, (do.call(cbind, corlist_ck_CTRLonly))), stringsAsFactors = F)

rownames(alldf_fib_ctrl) = ck_nm$hgnc_symbol
rownames(alldf_fib_ipf) = ck_nm$hgnc_symbol

colnames(cordf_fib_ctrl) <- paste0(colnames(cordf_fib_ctrl), "_CTRL")
colnames(cordf_fib_ipf) <- paste0(colnames(cordf_fib_ipf), "_IPF")

colnames(padjdf_fib_ctrl) <- paste0(colnames(padjdf_fib_ctrl), "_CTRL")
colnames(padjdf_fib_ipf) <- paste0(colnames(padjdf_fib_ipf), "_IPF")

cordf_both <- cbind(cordf_fib_ctrl, cordf_fib_ipf)
padjdf_both <- cbind(padjdf_fib_ctrl, padjdf_fib_ipf)

cordf_both <- cordf_both[c("OSM", "IL11", "IL6", "IL13"),]
padjdf_both <- padjdf_both[c("OSM", "IL11", "IL6", "IL13"),]

pal = colorspace::divergingx_hcl(25, "PuOr", rev = TRUE)[4:22]

par(mar = c(8,2,2,6))
bubbleMap(cordf_both, padjdf_both, color_pal = pal, main = "Partial correlation with fibrosis genes in IPF and CTRL", cex.binned = FALSE, cbins = 5)
## Using func as id variables

if(doit) save(list = ls(), file = "Microarray_Analysis_Final.RData")

Session info

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.4
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] C/UTF-8/C/C/C/C
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] robust_0.4-18        fit.models_0.5-14    viridis_0.5.1       
##  [4] viridisLite_0.3.0    pheatmap_1.0.12      gplots_3.0.1.1      
##  [7] nortest_1.0-4        beeswarm_0.2.3       biomaRt_2.38.0      
## [10] openxlsx_4.1.0       forcats_0.4.0        stringr_1.4.0       
## [13] dplyr_0.8.0.1        purrr_0.3.2          readr_1.3.1         
## [16] tidyr_0.8.3          tibble_2.1.1         ggplot2_3.1.1       
## [19] tidyverse_1.2.1      pspearman_0.3-0      RVAideMemoire_0.9-73
## [22] limma_3.38.3        
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.4-1     rio_0.5.16           class_7.3-15        
##   [4] rstudioapi_0.10      bit64_0.9-7          AnnotationDbi_1.44.0
##   [7] prodlim_2018.04.18   mvtnorm_1.0-10       lubridate_1.7.4     
##  [10] xml2_1.2.0           codetools_0.2-16     splines_3.5.2       
##  [13] robustbase_0.93-4    knitr_1.22           jsonlite_1.6        
##  [16] caret_6.0-82         broom_0.5.2          cluster_2.0.8       
##  [19] rrcov_1.4-7          compiler_3.5.2       httr_1.4.0          
##  [22] backports_1.1.4      assertthat_0.2.1     Matrix_1.2-17       
##  [25] lazyeval_0.2.2       cli_1.1.0            htmltools_0.3.6     
##  [28] prettyunits_1.0.2    tools_3.5.2          gtable_0.3.0        
##  [31] glue_1.3.1           reshape2_1.4.3       Rcpp_1.0.1          
##  [34] carData_3.0-2        Biobase_2.42.0       cellranger_1.1.0    
##  [37] gdata_2.18.0         nlme_3.1-139         iterators_1.0.10    
##  [40] timeDate_3043.102    xfun_0.6             gower_0.2.0         
##  [43] rvest_0.3.3          gtools_3.8.1         XML_3.98-1.19       
##  [46] DEoptimR_1.0-8       MASS_7.3-51.4        scales_1.0.0        
##  [49] ipred_0.9-8          hms_0.4.2            parallel_3.5.2      
##  [52] RColorBrewer_1.1-2   yaml_2.2.0           curl_3.3            
##  [55] memoise_1.1.0        gridExtra_2.3        rpart_4.1-15        
##  [58] reshape_0.8.8        stringi_1.4.3        RSQLite_2.1.1       
##  [61] S4Vectors_0.20.1     plotrix_3.7-5        pcaPP_1.9-73        
##  [64] foreach_1.4.4        e1071_1.7-1          caTools_1.17.1.2    
##  [67] BiocGenerics_0.28.0  boot_1.3-20          zip_2.0.1           
##  [70] lava_1.6.5           rlang_0.3.4          pkgconfig_2.0.2     
##  [73] bitops_1.0-6         evaluate_0.13        lattice_0.20-38     
##  [76] recipes_0.1.5        bit_1.1-14           tidyselect_0.2.5    
##  [79] plyr_1.8.4           magrittr_1.5         R6_2.4.0            
##  [82] IRanges_2.16.0       generics_0.0.2       DBI_1.0.0           
##  [85] foreign_0.8-71       pillar_1.3.1         haven_2.1.0         
##  [88] withr_2.1.2          abind_1.4-5          survival_2.44-1.1   
##  [91] RCurl_1.95-4.12      nnet_7.3-12          car_3.0-2           
##  [94] modelr_0.1.4         crayon_1.3.4         KernSmooth_2.23-15  
##  [97] rmarkdown_1.12       progress_1.2.0       grid_3.5.2          
## [100] readxl_1.3.1         data.table_1.12.2    blob_1.1.1          
## [103] ModelMetrics_1.2.2   digest_0.6.18        stats4_3.5.2        
## [106] munsell_0.5.0